{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "import keras\n",
    "from keras.datasets import mnist\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Dropout\n",
    "test_size=0.1\n",
    "batch_size = 1024\n",
    "epochs = 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('gene_expression_regression.csv')\n",
    "data_array=data.iloc[:,1:].values\n",
    "X = data_array[:,:943]\n",
    "Y = data_array[:,-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Sequential()\n",
    "model.add(Dense(512, activation='relu', input_shape=(X.shape[1],)))\n",
    "model.add(Dropout(0.2))\n",
    "model.add(Dense(512, activation='relu'))\n",
    "model.add(Dropout(0.2))\n",
    "model.add(Dense(1, activation='tanh'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "dense_4 (Dense)              (None, 512)               483328    \n",
      "_________________________________________________________________\n",
      "dropout_3 (Dropout)          (None, 512)               0         \n",
      "_________________________________________________________________\n",
      "dense_5 (Dense)              (None, 512)               262656    \n",
      "_________________________________________________________________\n",
      "dropout_4 (Dropout)          (None, 512)               0         \n",
      "_________________________________________________________________\n",
      "dense_6 (Dense)              (None, 1)                 513       \n",
      "=================================================================\n",
      "Total params: 746,497\n",
      "Trainable params: 746,497\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "Train on 116242 samples, validate on 12916 samples\n",
      "Epoch 1/20\n",
      "116242/116242 [==============================] - 5s 41us/step - loss: 0.3788 - mean_absolute_error: 0.4372 - val_loss: 0.2266 - val_mean_absolute_error: 0.3308\n",
      "Epoch 2/20\n",
      "116242/116242 [==============================] - 1s 11us/step - loss: 0.2125 - mean_absolute_error: 0.3275 - val_loss: 0.1861 - val_mean_absolute_error: 0.3059\n",
      "Epoch 3/20\n",
      "116242/116242 [==============================] - 1s 12us/step - loss: 0.1829 - mean_absolute_error: 0.3038 - val_loss: 0.1732 - val_mean_absolute_error: 0.2933\n",
      "Epoch 4/20\n",
      "116242/116242 [==============================] - 1s 11us/step - loss: 0.1718 - mean_absolute_error: 0.2937 - val_loss: 0.1707 - val_mean_absolute_error: 0.2915\n",
      "Epoch 5/20\n",
      "116242/116242 [==============================] - 1s 13us/step - loss: 0.1623 - mean_absolute_error: 0.2847 - val_loss: 0.1680 - val_mean_absolute_error: 0.2866\n",
      "Epoch 6/20\n",
      "116242/116242 [==============================] - 1s 11us/step - loss: 0.1556 - mean_absolute_error: 0.2777 - val_loss: 0.1652 - val_mean_absolute_error: 0.2846\n",
      "Epoch 7/20\n",
      "116242/116242 [==============================] - 1s 9us/step - loss: 0.1506 - mean_absolute_error: 0.2722 - val_loss: 0.1635 - val_mean_absolute_error: 0.2821\n",
      "Epoch 8/20\n",
      "116242/116242 [==============================] - 1s 8us/step - loss: 0.1473 - mean_absolute_error: 0.2681 - val_loss: 0.1591 - val_mean_absolute_error: 0.2758\n",
      "Epoch 9/20\n",
      "116242/116242 [==============================] - 1s 10us/step - loss: 0.1441 - mean_absolute_error: 0.2647 - val_loss: 0.1586 - val_mean_absolute_error: 0.2759\n",
      "Epoch 10/20\n",
      "116242/116242 [==============================] - 1s 12us/step - loss: 0.1407 - mean_absolute_error: 0.2602 - val_loss: 0.1563 - val_mean_absolute_error: 0.2729\n",
      "Epoch 11/20\n",
      "116242/116242 [==============================] - 1s 12us/step - loss: 0.1395 - mean_absolute_error: 0.2592 - val_loss: 0.1593 - val_mean_absolute_error: 0.2770\n",
      "Epoch 12/20\n",
      "116242/116242 [==============================] - 1s 12us/step - loss: 0.1370 - mean_absolute_error: 0.2556 - val_loss: 0.1577 - val_mean_absolute_error: 0.2753\n",
      "Epoch 13/20\n",
      "116242/116242 [==============================] - 1s 12us/step - loss: 0.1348 - mean_absolute_error: 0.2528 - val_loss: 0.1543 - val_mean_absolute_error: 0.2705\n",
      "Epoch 14/20\n",
      "116242/116242 [==============================] - 1s 10us/step - loss: 0.1334 - mean_absolute_error: 0.2506 - val_loss: 0.1537 - val_mean_absolute_error: 0.2677\n",
      "Epoch 15/20\n",
      "116242/116242 [==============================] - 1s 8us/step - loss: 0.1319 - mean_absolute_error: 0.2486 - val_loss: 0.1536 - val_mean_absolute_error: 0.2680\n",
      "Epoch 16/20\n",
      "116242/116242 [==============================] - 1s 9us/step - loss: 0.1307 - mean_absolute_error: 0.2468 - val_loss: 0.1526 - val_mean_absolute_error: 0.2681\n",
      "Epoch 17/20\n",
      "116242/116242 [==============================] - 1s 11us/step - loss: 0.1295 - mean_absolute_error: 0.2449 - val_loss: 0.1522 - val_mean_absolute_error: 0.2674\n",
      "Epoch 18/20\n",
      "116242/116242 [==============================] - 1s 8us/step - loss: 0.1285 - mean_absolute_error: 0.2436 - val_loss: 0.1513 - val_mean_absolute_error: 0.2667\n",
      "Epoch 19/20\n",
      "116242/116242 [==============================] - 1s 9us/step - loss: 0.1276 - mean_absolute_error: 0.2420 - val_loss: 0.1538 - val_mean_absolute_error: 0.2670\n",
      "Epoch 20/20\n",
      "116242/116242 [==============================] - 1s 10us/step - loss: 0.1265 - mean_absolute_error: 0.2402 - val_loss: 0.1521 - val_mean_absolute_error: 0.2646\n",
      "('Test loss:', 0.15210176009388301)\n",
      "('Test accuracy:', 0.26464902865476897)\n"
     ]
    }
   ],
   "source": [
    "model.summary()\n",
    "\n",
    "model.compile(loss='mean_squared_error',\n",
    "              optimizer=keras.optimizers.Adam(),\n",
    "              metrics=['mae'])\n",
    "\n",
    "history = model.fit(x_train, y_train,\n",
    "                    batch_size=batch_size,\n",
    "                    epochs=epochs,\n",
    "                    verbose=1,\n",
    "                    validation_data=(x_test, y_test))\n",
    "score = model.evaluate(x_test, y_test, verbose=0)\n",
    "print('Test loss:', score[0])\n",
    "print('Test accuracy:', score[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred=model.predict(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.00472247,  0.9925601 , -0.5268675 , ..., -1.105304  ,\n",
       "        1.6052095 ,  1.2793118 ])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1.        , 0.93710857],\n",
       "       [0.93710857, 1.        ]])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.corrcoef(y_test.flatten(), y_pred.flatten())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Baseline linear regression model result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LinearRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reg = LinearRegression()\n",
    "reg.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1.        , 0.90128197],\n",
       "       [0.90128197, 1.        ]])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.corrcoef(reg.predict(x_test).flatten(), y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
